In [2]:
import pandas as pd
import numpy as np
import json
from bokeh.resources import INLINE
from ipywidgets import interact
from bokeh.io import push_notebook ,output_notebook , show , output_file
from bokeh.plotting import figure
from bokeh.models.widgets import Select
from bokeh.models import HoverTool, ColumnDataSource , CheckboxGroup , CustomJS
from bokeh.layouts import widgetbox , row , layout
import seaborn as sns
%matplotlib inline
sns.set(style="white", palette="muted", color_codes=True)
store = pd.HDFStore('../../data/processed/orbf_benin.h5')
data = store['data']
store.close()
colormap = json.load(open('../../references/departements_colors.json'))
In [3]:
len(data)
a = data.groupby(['filetype_name' , 'entity_id' , 'period']).apply(len)
In [4]:
len(a['Facture Mensuelle PMA'])
Out[4]:
In [5]:
data.filetype_name.value_counts()
Out[5]:
In [6]:
def make_timeline_data(data) :
out = len(data)
data['correction'] = (data.indicator_claimed_value == data.indicator_verified_value)
perc_right = sum(data['correction']) / len(data)
return pd.DataFrame([[out, perc_right]])
timeline_data = data.groupby(['entity_name' , 'entity_type' ,'period']).apply(make_timeline_data).reset_index()
timeline_data.columns = ['entity_name', 'entity_type' ,'date' , 'level_1' , 'value' , 'percent_right']
In [7]:
def make_fac_desc(data) :
start_reporting = min(data.period)
n_values = len(data)
entity_type = data.entity_type.unique()[0]
data['correction'] = (data.indicator_claimed_value == data.indicator_verified_value)
perc_right = sum(data['correction']) / len(data)
return pd.DataFrame([[start_reporting ,n_values , entity_type , perc_right]])
fac_desc = data.groupby('entity_name').apply(make_fac_desc).reset_index()
fac_desc.columns = ['entity_name' , 'level_1' ,'date_start' , 'n_values' , 'entity_type' , 'percent_right']
In [8]:
result = fac_desc.sort_values(['entity_type' ,'date_start', 'n_values' , 'entity_name'], ascending=[ 0 , 0 , 1 , 0])
In [9]:
timeline_data.entity_name_cat = data.entity_name.astype('category').cat.set_categories(result['entity_name'], ordered=True)
In [10]:
colors = []
for i in range(len(timeline_data)) :
colors.append(colormap[timeline_data.loc[i , 'entity_type']])
We first look at the complete data to see how frequently correction happens in records.
The color is the type of facility we are considering. The shade is the percentage of reported indicator values that have been validated without being corrected.
In [12]:
#output_notebook(INLINE)
output_file('fac_data_quality')
In [ ]:
print(np.median(timeline_data.percent_right))
print(np.average(timeline_data.percent_right))
print(np.average(timeline_data.percent_right[timeline_data.percent_right < 1]))
print(np.median(timeline_data.percent_right[timeline_data.percent_right < 1]))
In [14]:
ax = sns.distplot(timeline_data.percent_right[timeline_data.percent_right < 1] , color="b" , bins=12)
ax.set(xlabel='% of correct indicators')
sns.plt.xlim(0,1.1)
ax.figure.savefig("../../reports/figures/correct_dist.pdf", dpi=1200)
In [13]:
source = ColumnDataSource(data=dict(
xname=timeline_data.date.astype(str).tolist(),
yname=timeline_data.entity_name.tolist() ,
colors=colors,
alphas= timeline_data.percent_right.tolist() ,
type_fac = timeline_data.entity_type ,
))
TOOLS = "hover,save,pan,box_zoom,wheel_zoom,reset"
p = figure(title="Monthly Reporting by facility",
x_axis_location="above", tools=TOOLS,
x_range=sorted(list(set(timeline_data.date.astype(str).tolist()))),
y_range=result['entity_name'].astype(str).tolist())
p.plot_width = 800
p.plot_height = 1200
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "7pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = np.pi/3
p.rect('xname', 'yname', 0.9, 0.9, source=source,
color='colors', alpha='alphas', line_color=None,
hover_line_color='black', hover_color='colors')
p.select_one(HoverTool).tooltips = [
('Fosa', '@yname') ,
('Period' , '@xname') ,
('% Right' , '@alphas')
]
show(p)
We can see data has been entered since March 2012 in a group of facilities, and is entered regularly until november 2016. A second group of facilities start reporting in July 2015 and appear to stop reporting one year later. Some facilites start reporting around April 2014 and have been reporting since then.
We see facility specific patterns, with some facilities being consistently less corrected than others. This can be added to a period effect, withy apparently less correction at the end of periods. This could be interrogated.
In [16]:
zone_color_dico = json.load(open('../../references/departements_colors.json'))
data['parent_geozone_name_color'] = ""
data['indicator_color'] = ""
for i in sorted(data.parent_geozone_name.unique().tolist()) :
data.loc[data.parent_geozone_name == i , 'parent_geozone_name_color'] = zone_color_dico[i]
In [17]:
def received_amounts(data):
return data.indicator_montant.sum(skipna = True)
amount_fac = data.groupby(['entity_id' , 'date' , 'entity_type' , 'entity_name' , 'parent_geozone_name' , 'parent_geozone_name_color']).apply(received_amounts)
amount_fac = amount_fac[amount_fac < 50000000]
amount_fac = amount_fac.reset_index()
amount_fac.columns = ['entity_id' , 'date' , 'entity_type' , 'entity_name' ,'parent_geozone_name' , 'parent_geozone_name_color' , 'value']
In [18]:
ax = sns.distplot(amount_fac.value , color="b" , bins=50)
ax.set(xlabel='% of correct indicators')
sns.plt.xlim(0,)
ax.figure.savefig("../../reports/figures/payment_distribution.pdf", dpi=1200)
In [19]:
gammas = sns.load_dataset("gammas")
In [24]:
gammas.head()
Out[24]:
In [22]:
amount_fac.head()
Out[22]:
In [21]:
ax = sns.tsplot(data = amount_fac , time = 'date' , value = 'value' , color="b" )
ax.set(xlabel='% of correct indicators')
#sns.plt.xlim(0,)
#ax.figure.savefig("../../reports/figures/payment_distribution.pdf", dpi=1200)
Out[21]:
In [12]:
def make_multiline_data(data):
xs = [data.date.tolist()]
ys = [data.value.tolist()]
color = [data.parent_geozone_name_color.tolist()]
return pd.DataFrame([[xs[0] , ys[0]]])
amount_fac = amount_fac.groupby(['entity_id' ,'entity_type' , 'entity_name' , 'parent_geozone_name' , 'parent_geozone_name_color']).apply(make_multiline_data)
amount_fac = amount_fac.reset_index()
amount_fac.columns = ['entity_id' ,'entity_type' , 'entity_name' , 'parent_geozone_name' , 'parent_geozone_name_color', 'level' , 'xs' , 'ys']
In [13]:
dept = ['All'] + sorted(data.parent_geozone_name.unique())
indics = ['All'] + sorted(data.indicator_label.unique())
In [14]:
output_notebook(INLINE)
In [15]:
source = ColumnDataSource(data=dict(xs = [] , ys = [] , color = [], facility_name = [] , legend = []))
TOOLS = "hover,save,pan,box_zoom,wheel_zoom,reset"
p = figure(x_axis_type="datetime", plot_height=600, plot_width=700, title="Evolution of amounts paid" , tools=TOOLS)
p.grid.grid_line_alpha=0.3
p.xaxis.axis_label = 'Date'
p.yaxis.axis_label = 'Amount Paid'
p.multi_line('xs' , 'ys', source = source , color='color' , alpha=0.4)
p.select_one(HoverTool).tooltips = [("Facility", "@facility_name") ,
("Département" , "@legend")
]
def update(Zones = "All"):
df = amount_fac
if (Zones != "All"):
df = df[df.parent_geozone_name.str.contains(Zones)==True]
source.data = dict(
color = df['parent_geozone_name_color'].tolist() ,
xs=df['xs'].tolist(),
ys=df['ys'].tolist(),
facility_name = df['entity_name'].tolist(),
legend = df['parent_geozone_name'].tolist(),
)
push_notebook()
show(p, notebook_handle=True)
update()
In [16]:
interact(update, Zones=dept)
Out[16]:
There are large variations in the amounts of money paid to different facilities for RBF. Variations happen within facilities as well as between different facilites. Amplitudes of variations do not appear to have evidently defined patterns.
In [17]:
verif_data = data
verif_data['difference_indicateur'] = verif_data.indicator_verified_value - verif_data.indicator_claimed_value
verif_data['perc_variation_indicator'] = verif_data['difference_indicateur'] / verif_data['indicator_claimed_value']
In [18]:
verif_data = verif_data[(verif_data.indicator_claimed_value < 5000.0) &
(verif_data.perc_variation_indicator < 20)
]
In [19]:
output_notebook(INLINE)
In [20]:
source = ColumnDataSource(data=dict(xs = [] , ys = [] , facility_name = [] , departement = [] , indicator = []))
TOOLS = "hover,save,pan,box_zoom,wheel_zoom,reset"
p = figure(plot_height=600, plot_width=700, title="Evolution of amounts paid" , tools=TOOLS)
p.grid.grid_line_alpha=0.3
p.xaxis.axis_label = 'Reported Amount'
p.yaxis.axis_label = '% Variation'
p.circle('xs' , 'ys', source = source , alpha=0.4)
p.select_one(HoverTool).tooltips = [("Facility", "@facility_name") ,
("Département" , "@departement") ,
('Indicator' , "@indicator")
]
def update(departement = "All" , Indicateurs = 'All'):
df = verif_data
if (departement != "All"):
df = df[df.parent_geozone_name.str.contains(departement)==True]
indics = ['All'] + sorted(df.indicator_label.unique())
if (Indicateurs != "All"):
df = df[df.indicator_label == Indicateurs]
source.data = dict(
xs=df['indicator_claimed_value'].tolist(),
ys=df['perc_variation_indicator'].tolist(),
facility_name = df['entity_name'].tolist(),
departement = df['parent_geozone_name'].tolist(),
indicator = df['indicator_label'].tolist()
)
push_notebook()
show(p, notebook_handle=True)
update()
In [21]:
interact(update, departement=dept , Indicateurs = indics)
Out[21]:
In [22]:
def correct_amount(data) :
data.indicator_claimed_value = sum(data.indicator_claimed_value)
data.difference_indicateur = sum(data.indicator_verified_value - data.indicator_claimed_value)
data.claimed_montant = sum(data.indicator_claimed_value * data.indicator_tarif)
data.difference_montant = sum(data.indicator_montant - data.claimed_montant)
return pd.DataFrame([[data.claimed_montant.tolist() ,
data.difference_montant.tolist() ,
data.difference_indicateur.tolist() ,
data.indicator_claimed_value.tolist()[0]]])
verif_data_report = data.groupby(['entity_id' , 'entity_name' , 'parent_geozone_name' ,'date']).apply(correct_amount)
verif_data_report = verif_data_report.reset_index()
verif_data_report.columns = ['entity_id' , 'entity_name' , 'parent_geozone_name' , 'date' , 'level_5',
'claimed_montant' , 'difference_montant' , 'difference_indicateur' , 'claimed_indicateur']
verif_data_report['perc_variation_montant'] = verif_data_report['difference_montant'] / verif_data_report['claimed_montant']
#verif_data_report = verif_data_report[(np.abs(verif_data_report.difference_montant) < 500000000)]
print(np.median(verif_data_report.difference_montant))
In [119]:
verif_data_report_plot = verif_data_report[(np.abs(verif_data_report.difference_montant) <10000000000)]
ax = sns.distplot(verif_data_report_plot.difference_montant , color="b")
ax.set(xlabel='Correction of payment after data validation')
ax.figure.savefig("../../reports/figures/payment_correction.png", dpi=1200)
In [118]:
np.median(verif_data_report.difference_montant[(np.abs(verif_data_report.difference_montant) <10000000000)])
Out[118]:
In [ ]:
ax = sns.distplot(timeline_data.percent_right[timeline_data.percent_right < 1] , color="b" , bins=12)
ax.set(xlabel='% of correct indicators')
sns.plt.xlim(0,1.1)
ax.figure.savefig("../../reports/figures/correct_dist.pdf", dpi=1200)
In [131]:
import matplotlib.pyplot as plt
sns.despine(left=True)
verif_data_report_plot2 = verif_data_report_plot[verif_data_report_plot.entity_name.isin(['Csa Dekanme Cs' , 'Tohoue Csc' , 'Bembe Di' , 'Zounta Mi'])]
g = sns.FacetGrid(verif_data_report_plot2, col="entity_name", col_wrap=2, size=5 , sharex = False , sharey = False )
g = g.map(sns.distplot, "difference_montant")
g.fig.savefig("../../reports/figures/facility_correct.pdf", dpi=1000)
In [ ]: